library(tidyverse)
library(dplyr)
library(janitor)
So hb_geo_codes_14_19 is simply a lookup table of the health board codes for each of the health boards in Scotland. I could join it to health_board_incidences and then filter for NHS Borders. Or, simpler still, I can just look up the health board code for NHS Borders in the first table and filter the second table by that code.
# Show the row(s) of the health-board lookup table whose name is NHS Borders.
filter(hb_geo_codes_14_19, hb_name == "NHS Borders")
Questions I can think to ask, off the bat:
So, off the bat, I’m seeing that including “All cancer types” may be making this data harder to read.
So I’m going to run the same code again, but filter out “All cancer types”, to see what happens.
unique(health_board_incidences_Borders$cancer_site)
[1] "All cancer types" "Bladder"
[3] "Bone and articular cartilage" "Bone and connective tissue"
[5] "Connective tissue" "Malignant brain cancer"
[7] "Malig brain ca (incl pit. gland, cranio. duct, pineal gland)" "Non-malig brain ca (incl pit.gland,cranio.duct,pineal gland)"
[9] "All brain and CNS tumours (malignant and non-malignant)" "Breast"
[11] "Carcinoma in situ of the breast" "Colorectal cancer"
[13] "Colon" "Rectum and rectosigmoid junction"
[15] "Cervix uteri" "Carcinoma in situ of the cervix uteri"
[17] "Corpus uteri" "Ovary"
[19] "Uterus" "Vagina"
[21] "Vulva" "Head and neck"
[23] "Larynx" "Lip, oral cavity and pharynx"
[25] "Mouth (IARC definition)" "Oral cavity"
[27] "Salivary glands" "Thyroid"
[29] "Tongue" "Oropharyngeal cancers"
[31] "Hodgkin lymphoma" "Kidney"
[33] "Leukaemias" "Acute lymphoblastic leukaemia"
[35] "Acute myeloid leukaemia" "Chronic lymphocytic leukaemia"
[37] "Chronic myeloid leukaemia" "Liver and intrahepatic bile ducts"
[39] "Trachea, bronchus and lung" "Mesothelioma"
[41] "Penis" "Prostate"
[43] "Testis" "Multiple myeloma and malignant plasma cell neoplasms"
[45] "Non-Hodgkin lymphoma" "Oesophagus"
[47] "Pancreas" "Malignant melanoma of the skin"
[49] "Non-melanoma skin cancer" "Basal cell carcinoma of the skin"
[51] "Squamous cell carcinoma of the skin" "Stomach"
cancers_1
[1] "All cancer types" "Bladder"
[3] "Bone and articular cartilage" "Bone and connective tissue"
[5] "Connective tissue" "Malignant brain cancer"
[7] "Malig brain ca (incl pit. gland, cranio. duct, pineal gland)" "Non-malig brain ca (incl pit.gland,cranio.duct,pineal gland)"
[9] "All brain and CNS tumours (malignant and non-malignant)" "Breast"
[11] "Carcinoma in situ of the breast" "Colorectal cancer"
[13] "Colon" "Rectum and rectosigmoid junction"
[15] "Cervix uteri" "Carcinoma in situ of the cervix uteri"
[17] "Corpus uteri" "Ovary"
[19] "Uterus" "Vagina"
[21] "Vulva" "Head and neck"
[23] "Larynx" "Lip, oral cavity and pharynx"
[25] "Mouth (IARC definition)" "Oral cavity"
[27] "Salivary glands" "Thyroid"
[29] "Tongue" "Oropharyngeal cancers"
[31] "Hodgkin lymphoma" "Kidney"
[33] "Leukaemias" "Acute lymphoblastic leukaemia"
[35] "Acute myeloid leukaemia" "Chronic lymphocytic leukaemia"
[37] "Chronic myeloid leukaemia" "Liver and intrahepatic bile ducts"
[39] "Trachea, bronchus and lung" "Mesothelioma"
[41] "Penis" "Prostate"
[43] "Testis" "Multiple myeloma and malignant plasma cell neoplasms"
[45] "Non-Hodgkin lymphoma" "Oesophagus"
[47] "Pancreas" "Malignant melanoma of the skin"
[49] "Non-melanoma skin cancer" "Basal cell carcinoma of the skin"
[51] "Squamous cell carcinoma of the skin" "Stomach"
unique(health_board_incidences_Borders$sex)
[1] "All" "Male" "Female"
# Keep only the breast-cancer rows and print them grouped by year.
breast_only <- function(df) filter(df, cancer_site == "Breast")
health_board_incidences_Borders %>%
  breast_only() %>%
  group_by(year)
health_board_incidences_Borders
# Line chart of yearly incidences for the sex == "All" rows, one facet per
# cancer site. The "All cancer types" aggregate is excluded from the facets.
#
# Fix: the original called tidyr::gather() with no pipe after it, so the
# following filter() had no data argument and the chunk could not run. The
# plot only needs the columns as they already are, so the reshape is removed
# and the data frame is piped straight into filter().
health_board_incidences_Borders %>%
  filter(
    sex == "All",
    cancer_site != "All cancer types"
  ) %>%
  ggplot(aes(x = year, y = incidences_all_ages)) +
  geom_line(color = "#800080") +
  facet_wrap(~ cancer_site)
unique(health_board_incidences_Borders$year)
[1] 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
[20] 2013 2014 2015 2016 2017 2018
# Rows for the individual cancer sites, largest incidence count first.
# arrange() sorts across the whole table (it does not sort within groups
# unless asked to), so sorting first and grouping afterwards gives the same
# grouped, sorted result.
health_board_incidences_Borders %>%
  filter(cancer_site != "All cancer types") %>%
  arrange(desc(incidences_all_ages)) %>%
  group_by(cancer_site)
# Drop the confidence-interval and qualifier columns, then stack the three
# headline measures into long form: one row per original row and metric, with
# the measure name in `metric` and its number in `value`.
#
# Fix: the original `cols = c("incidences_all_ages", "crude_rate", "easr",)`
# ended with an empty argument. Base R's c() rejects an empty trailing
# argument, so the stray comma is removed.
health_board_incidences_Borders %>%
  dplyr::select(
    -c(
      crude_rate_lower95pc_confidence_interval,
      crude_rate_upper95pc_confidence_interval,
      sex_qf,
      easr_lower95pc_confidence_interval,
      easr_lower95pc_confidence_interval_qf,
      easr_upper95pc_confidence_interval,
      easr_upper95pc_confidence_interval_qf,
      wasr_lower95pc_confidence_interval,
      wasr_lower95pc_confidence_interval_qf,
      wasr_upper95pc_confidence_interval,
      wasr_upper95pc_confidence_interval_qf,
      sir_lower95pc_confidence_interval,
      sir_upper95pc_confidence_interval
    ),
    # NOTE(review): this column was listed without a minus sign in the
    # original, so it is kept; confirm it was not meant to be dropped.
    cancer_site_icd10code
  ) %>%
  pivot_longer(
    cols = c("incidences_all_ages", "crude_rate", "easr"),
    names_to = "metric",
    values_to = "value"
  )
# Read the raw population file, snake_case its column names, and add up
# `value` over every row that shares a `date_code`.
"raw_data/borders_population.csv" %>%
  read_csv() %>%
  clean_names() %>%
  group_by(date_code) %>%
  summarise(total = sum(value))
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
FeatureCode = col_character(),
DateCode = col_double(),
Measurement = col_character(),
Units = col_character(),
Value = col_double(),
Age = col_character(),
Sex = col_character()
)
`summarise()` ungrouping output (override with `.groups` argument)
# One population figure per date_code for feature code S08000016
# (presumably the NHS Borders health board code; verify against
# hb_geo_codes_14_19), restricted to the all-ages, all-sexes rows.
# `value` is renamed to `population`; the descriptive columns are dropped.
borders_population <-
  "raw_data/borders_population.csv" %>%
  read_csv() %>%
  clean_names() %>%
  filter(
    feature_code == "S08000016",
    age == "All",
    sex == "All"
  ) %>%
  rename(population = value) %>%
  dplyr::select(-c(measurement, units, age, sex))
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
FeatureCode = col_character(),
DateCode = col_double(),
Measurement = col_character(),
Units = col_character(),
Value = col_double(),
Age = col_character(),
Sex = col_character()
)
borders_population
# Attach the yearly population figure to every incidence row (matching
# population `date_code` to incidence `year`), drop the confidence-interval
# and qualifier columns, and derive incidences per 100,000 population.
# NOTE(review): borders_population holds the sex == "All" population only,
# so the same denominator is applied to the Male and Female incidence rows;
# confirm that is intended.
health_board_incidences_Borders_population <-
  borders_population %>%
  inner_join(
    health_board_incidences_Borders,
    by = c("date_code" = "year")
  ) %>%
  dplyr::select(
    -c(
      crude_rate_lower95pc_confidence_interval,
      crude_rate_upper95pc_confidence_interval,
      sex_qf
    ),
    # NOTE(review): listed without a minus sign, so this column is kept;
    # confirm it was not meant to be dropped like its neighbours.
    cancer_site_icd10code,
    -c(
      easr_lower95pc_confidence_interval,
      easr_lower95pc_confidence_interval_qf,
      easr_upper95pc_confidence_interval,
      easr_upper95pc_confidence_interval_qf,
      wasr_lower95pc_confidence_interval,
      wasr_lower95pc_confidence_interval_qf,
      wasr_upper95pc_confidence_interval,
      wasr_upper95pc_confidence_interval_qf,
      sir_lower95pc_confidence_interval,
      sir_upper95pc_confidence_interval
    )
  ) %>%
  mutate(per_hundred_thousand = incidences_all_ages / population * 100000)
health_board_incidences_Borders_population
# Line chart of the three headline measures over time. The measures are
# stacked into long form first so that `metric` can drive the line colour.
#
# Fixes:
# * `fill` is not an aesthetic that geom_line() draws, so the metrics were
#   never told apart; `colour` is used instead.
# * Without an explicit group, rows from different cancer sites and sexes
#   were joined into a single path. Each site / sex / metric combination now
#   gets its own line.
# * The stray trailing comma inside c(...) is removed.
# * group_by() before ggplot() had no effect on the plot and is dropped.
health_board_incidences_Borders_population %>%
  pivot_longer(
    cols = c("incidences_all_ages", "crude_rate", "easr"),
    names_to = "metric",
    values_to = "value"
  ) %>%
  ggplot() +
  aes(
    x = date_code,
    y = value,
    colour = metric,
    group = interaction(cancer_site, sex, metric)
  ) +
  geom_line()
NA
# Slimmer copy of the Borders incidence table without the confidence-interval
# and qualifier columns.
health_board_incidences_Borders_lean <-
  dplyr::select(
    health_board_incidences_Borders,
    -c(
      crude_rate_lower95pc_confidence_interval,
      crude_rate_upper95pc_confidence_interval,
      sex_qf
    ),
    # NOTE(review): listed without a minus sign, so this column is kept;
    # confirm it was not meant to be dropped like its neighbours.
    cancer_site_icd10code,
    -c(
      easr_lower95pc_confidence_interval,
      easr_lower95pc_confidence_interval_qf,
      easr_upper95pc_confidence_interval,
      easr_upper95pc_confidence_interval_qf,
      wasr_lower95pc_confidence_interval,
      wasr_lower95pc_confidence_interval_qf,
      wasr_upper95pc_confidence_interval,
      wasr_upper95pc_confidence_interval_qf,
      sir_lower95pc_confidence_interval,
      sir_upper95pc_confidence_interval
    )
  )
# Horizontal bar chart of incidences per cancer site for 2005 onwards,
# excluding the "All cancer types" aggregate. Sites are ordered by
# reorder()'s default summary (the mean) of incidences_all_ages.
# No sex filter is applied here, so rows for every value of `sex` contribute
# to each bar.
health_board_incidences_Borders_lean %>%
  filter(
    cancer_site != "All cancer types",
    year >= 2005
  ) %>%
  group_by(cancer_site) %>%
  ggplot(
    aes(
      x = reorder(cancer_site, incidences_all_ages),
      y = incidences_all_ages
    )
  ) +
  geom_col(fill = "#800020") +
  scale_x_discrete() +
  coord_flip() +
  labs(x = "Type of Cancer", y = "Incidences") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
So I’m going with the data from 2005 onwards (2005–2018), which seems like more than enough for the topic.
# Male rows from 2005 onwards for the individual cancer sites, largest
# incidence count first. The sort is table-wide, so it is done before the
# grouping is attached.
health_board_incidences_Borders_lean %>%
  filter(
    sex == "Male",
    cancer_site != "All cancer types",
    year >= 2005
  ) %>%
  arrange(desc(incidences_all_ages)) %>%
  group_by(cancer_site)
# Sum of incidences_all_ages from 2005 onwards over the three skin-cancer
# groupings (the non-melanoma heading plus the basal and squamous entries).
health_board_incidences_Borders %>%
  filter(
    year >= 2005,
    cancer_site %in% c(
      "Non-melanoma skin cancer",
      "Basal cell carcinoma of the skin",
      "Squamous cell carcinoma of the skin"
    )
  ) %>%
  summarise(sum = sum(incidences_all_ages))
# Sum of incidences_all_ages from 2005 onwards over the three colorectal
# groupings (the colorectal heading plus the colon and rectum entries).
health_board_incidences_Borders %>%
  filter(
    year >= 2005,
    cancer_site %in% c(
      "Rectum and rectosigmoid junction",
      "Colon",
      "Colorectal cancer"
    )
  ) %>%
  summarise(sum = sum(incidences_all_ages))
sort(cancers)
[1] "Acute lymphoblastic leukaemia" "Acute myeloid leukaemia"
[3] "All brain and CNS tumours (malignant and non-malignant)" "All cancer types"
[5] "Basal cell carcinoma of the skin" "Bladder"
[7] "Bone and articular cartilage" "Bone and connective tissue"
[9] "Breast" "Carcinoma in situ of the breast"
[11] "Carcinoma in situ of the cervix uteri" "Cervix uteri"
[13] "Chronic lymphocytic leukaemia" "Chronic myeloid leukaemia"
[15] "Colon" "Colorectal cancer"
[17] "Connective tissue" "Corpus uteri"
[19] "Head and neck" "Hodgkin lymphoma"
[21] "Kidney" "Larynx"
[23] "Leukaemias" "Lip, oral cavity and pharynx"
[25] "Liver and intrahepatic bile ducts" "Malig brain ca (incl pit. gland, cranio. duct, pineal gland)"
[27] "Malignant brain cancer" "Malignant melanoma of the skin"
[29] "Mesothelioma" "Mouth (IARC definition)"
[31] "Multiple myeloma and malignant plasma cell neoplasms" "Non-Hodgkin lymphoma"
[33] "Non-malig brain ca (incl pit.gland,cranio.duct,pineal gland)" "Non-melanoma skin cancer"
[35] "Oesophagus" "Oral cavity"
[37] "Oropharyngeal cancers" "Ovary"
[39] "Pancreas" "Penis"
[41] "Prostate" "Rectum and rectosigmoid junction"
[43] "Salivary glands" "Squamous cell carcinoma of the skin"
[45] "Stomach" "Testis"
[47] "Thyroid" "Tongue"
[49] "Trachea, bronchus and lung" "Uterus"
[51] "Vagina" "Vulva"
unique(health_board_incidences_Borders$cancer_site_icd10code)
[1] "C00-C97, excluding C44" "C67"
[3] "C40-C41" "ICD-10 C40-C41, C47+C49"
[5] "ICD-10 C47+C49" "C71"
[7] "C70-C72, C75.1-C75.3" "D18.0, D32-D33, D35.2-D35.4, D42-D43, D44.3-D44.5"
[9] "C70-C72, C75.1-C75.3, D18.0, D32-D33, D35.2-D35.4, D42-D43, D44.3-D44.5" "C50"
[11] "D05" "C18-C20"
[13] "C18" "C19-C20"
[15] "C53" "D06"
[17] "C54" "C56"
[19] "C53-C55" "C52"
[21] "C51" "C00-C14, C30-C32"
[23] "C32" "C00-C14"
[25] "C03-C06" "C01-C06"
[27] "C07-C08" "C73"
[29] "C01-C02" "C01, C02.4, C05.1, C05.2, C09, C10"
[31] "C81" "C64-C65"
[33] "C91-C95" "C91.0"
[35] "C92.0" "C91.1"
[37] "C92.1-C92.2" "C22"
[39] "C33-C34" "C45"
[41] "C60" "C61"
[43] "C62" "C90"
[45] "C82-C86" "C15"
[47] "C25" "C43"
[49] "C44" "C44, M-8090-8098"
[51] "C44, M-8050-8078, M-8083-8084" "C16"
# Borders incidence rows with overlapping ICD-10 groupings removed, so that a
# narrower grouping is not counted again alongside the broader grouping that
# contains it.
#
# Fixes relative to the original chain of `!=` conditions:
# * "C00-C14, C30-C32 " (Head and neck) had a trailing space and therefore
#   never matched any row, so that aggregate was silently kept. The code is
#   now spelled exactly as it appears in the data.
# * "C00-C97, excluding C44 " (All cancer types) also had a trailing space and
#   never matched. That dead condition is removed rather than repaired: the
#   aggregate row stays available, and the later plotting chunks already
#   exclude it explicitly with cancer_site != "All cancer types".
# * Codes that were listed two or three times are listed once.
#
# The codes are compared as exact strings, so spacing must match the data.
health_board_incidences_Borders_icd <-
  health_board_incidences_Borders %>%
  filter(
    !cancer_site_icd10code %in% c(
      # colorectal: C18-C20 is kept, so its two parts are not needed
      "C18",
      "C19-C20",
      # skin cancer: C44 is kept, it includes both squamous and basal
      "C44, M-8050-8078, M-8083-8084",
      "C44, M-8090-8098",
      # leukaemia sub-types: the C91-C95 grouping is kept
      "C91.0",
      "C91.1",
      "C92.0",
      "C92.1-C92.2",
      # head and neck aggregates that overlap the oral groupings
      "C00-C14, C30-C32",
      "C00-C14",
      "C01, C02.4, C05.1, C05.2, C09, C10",
      # keeping C01-C06, which is oral cancer
      "C01-C02",
      "C03-C06",
      # keeping C53-C55, which is uterine cancer
      "C53",
      "C54",
      # brain: remove the grouping that does not include the D codes
      "C70-C72, C75.1-C75.3"
    )
  )
# List each ICD-10 code still present after de-duplication, in sorted order.
arrange(
  distinct(health_board_incidences_Borders_icd, cancer_site_icd10code),
  cancer_site_icd10code
)
# Horizontal bar chart of incidences per cancer site for 2005 onwards, using
# the de-duplicated table and excluding the "All cancer types" aggregate.
# Sites are ordered by reorder()'s default summary (the mean) of the negated
# incidence counts. No sex filter is applied, so rows for every value of
# `sex` contribute to each bar.
health_board_incidences_Borders_icd %>%
  filter(
    cancer_site != "All cancer types",
    year >= 2005
  ) %>%
  ggplot(
    aes(
      x = reorder(cancer_site, -incidences_all_ages),
      y = incidences_all_ages
    )
  ) +
  geom_col(fill = "#800020") +
  scale_x_discrete() +
  coord_flip() +
  labs(x = "Type of Cancer", y = "Incidences") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
sort(unique(health_board_incidences_Borders_icd$cancer_site))
[1] "All brain and CNS tumours (malignant and non-malignant)" "All cancer types"
[3] "Bladder" "Bone and articular cartilage"
[5] "Bone and connective tissue" "Breast"
[7] "Carcinoma in situ of the breast" "Carcinoma in situ of the cervix uteri"
[9] "Colorectal cancer" "Connective tissue"
[11] "Head and neck" "Hodgkin lymphoma"
[13] "Kidney" "Larynx"
[15] "Leukaemias" "Liver and intrahepatic bile ducts"
[17] "Malignant brain cancer" "Malignant melanoma of the skin"
[19] "Mesothelioma" "Multiple myeloma and malignant plasma cell neoplasms"
[21] "Non-Hodgkin lymphoma" "Non-malig brain ca (incl pit.gland,cranio.duct,pineal gland)"
[23] "Oesophagus" "Oral cavity"
[25] "Ovary" "Pancreas"
[27] "Penis" "Prostate"
[29] "Salivary glands" "Stomach"
[31] "Testis" "Thyroid"
[33] "Trachea, bronchus and lung" "Uterus"
[35] "Vagina" "Vulva"
# Horizontal bar chart of total incidences per cancer site from 2005 onwards
# (sex == "All" rows only, "All cancer types" aggregate excluded), with the
# sites ordered by their total.
health_board_incidences_Borders_icd %>%
  filter(
    sex == "All",
    cancer_site != "All cancer types",
    year >= 2005
  ) %>%
  group_by(cancer_site) %>%
  summarise(total_incidences = sum(incidences_all_ages)) %>%
  arrange(desc(total_incidences)) %>%
  ggplot(
    aes(
      x = reorder(cancer_site, total_incidences),
      y = total_incidences
    )
  ) +
  geom_col(fill = "#800020") +
  scale_x_discrete() +
  coord_flip() +
  labs(x = "Type of Cancer", y = "Since 2005") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
`summarise()` ungrouping output (override with `.groups` argument)
NA
NA
NA
# Horizontal bar chart of 2018 incidences per cancer site (sex == "All" rows
# only, "All cancer types" aggregate excluded), with the sites ordered by
# their count.
health_board_incidences_Borders_icd %>%
  filter(
    sex == "All",
    cancer_site != "All cancer types",
    year == 2018
  ) %>%
  group_by(cancer_site) %>%
  summarise(incidences_2018 = sum(incidences_all_ages)) %>%
  arrange(desc(incidences_2018)) %>%
  ggplot(
    aes(
      x = reorder(cancer_site, incidences_2018),
      y = incidences_2018
    )
  ) +
  geom_col(fill = "#800020") +
  scale_x_discrete() +
  coord_flip() +
  labs(x = "Type of Cancer", y = "2018") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
`summarise()` ungrouping output (override with `.groups` argument)
NA
NA
NA
# Table of 2018 incidences per cancer site for the sex == "All" rows, largest
# first. The "All cancer types" aggregate is not filtered out here, so it
# appears in the table too.
#
# Fix: the summary column was named `incidences_since_2005`, but the filter
# keeps only year == 2018, so the name misdescribed the numbers. It is renamed
# to match what is actually summed.
# NOTE(review): if a since-2005 total was intended instead, change the filter
# to year >= 2005 rather than the column name.
health_board_incidences_Borders_icd %>%
  filter(
    year == 2018,
    sex == "All"
  ) %>%
  group_by(cancer_site) %>%
  summarise(incidences_2018 = sum(incidences_all_ages)) %>%
  arrange(desc(incidences_2018))
# Horizontal bar chart of 2018 incidences per cancer site for the
# sex == "Female" rows ("All cancer types" aggregate excluded), with the
# sites ordered by their count.
health_board_incidences_Borders_icd %>%
  filter(
    sex == "Female",
    cancer_site != "All cancer types",
    year == 2018
  ) %>%
  group_by(cancer_site) %>%
  summarise(incidences_2018 = sum(incidences_all_ages)) %>%
  arrange(desc(incidences_2018)) %>%
  ggplot(
    aes(
      x = reorder(cancer_site, incidences_2018),
      y = incidences_2018
    )
  ) +
  geom_col(fill = "#800020") +
  scale_x_discrete() +
  coord_flip() +
  labs(x = "Type of Cancer", y = "2018") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
`summarise()` ungrouping output (override with `.groups` argument)
NA
NA
# Horizontal bar chart of 2018 incidences per cancer site for the
# sex == "Male" rows ("All cancer types" aggregate excluded), with the
# sites ordered by their count.
health_board_incidences_Borders_icd %>%
  filter(
    sex == "Male",
    cancer_site != "All cancer types",
    year == 2018
  ) %>%
  group_by(cancer_site) %>%
  summarise(incidences_2018 = sum(incidences_all_ages)) %>%
  arrange(desc(incidences_2018)) %>%
  ggplot(
    aes(
      x = reorder(cancer_site, incidences_2018),
      y = incidences_2018
    )
  ) +
  geom_col(fill = "#800020") +
  scale_x_discrete() +
  coord_flip() +
  labs(x = "Type of Cancer", y = "2018") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
`summarise()` ungrouping output (override with `.groups` argument)
NA
NA